#!/usr/bin/perl -w

#
# Copyright 2005 John Carter and The Apache Software Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

###############################################################################
#
# This script gets the list of wikinames from the usemod wiki,
# converts to moinmoin & uploads if different.
#
# Usage:
# 
# 1) Create a WikiMigrationBot user in your MoinMoin wiki.
# 2) Edit this script with the URLs of your UseMod and MoinMoin wikis, and
# with the password of the WikiMigrationBot user.
# 
# Execution:
# Run the script with a list of WikiNames to be ported, or with no
# arguments to port every page.
# 
# ie:
# ./wikiport.pl WikiPageToBePorted SomeOtherWikiPageToBePorted
#
#
# John Carter john@therefromhere.org
#
# http://www.therefromhere.org/software/wikiport/
# 
###############################################################################

use strict;

package UseModtoMoinMoinPort;

use WWW::Mechanize;
use encoding 'utf8';

# Configuration.
#
# Query strings are appended directly to the UseMod URL and page names
# directly to the MoinMoin URL, hence the trailing '?' and '/'.
my $UseModBaseUrl   = 'http://user:pw@wiki.example.org/wiki.pl?';
my $MoinMoinBaseUrl = 'http://wikiport.example.org/wikiport/';

# Before running the script, manually create a WikiMigrationBot user on MoinMoin
my ($BotUserName, $BotPassword) = ('WikiMigrationBot', 'password');

# These pages are read-only in the default MoinMoin setup & so can't be ported.
# The only one of these we need to port manually is WikiName
# (the others are autogenerated by both UseMod and MoinMoin).
my @ImmutablePages = qw( CategoryCategory RecentChanges WikiName );

# Per-run result lists: filled in while porting, printed at the end.
my (@UpdatedPages, @UnchangedPages, @CreatedPages);
my (@BrokenMarkupPages, @UnCreatedPages);

# Per-page problem records, keyed by line number. PortPage empties them
# before converting each page.
my (%ImageInTitle, %LinkInTitle, %WikiNameInTitle);
my (%EntityInTitle, %EntityInLink);

# A single user agent is shared by every request the script makes.
my $Mech = WWW::Mechanize->new();

# Main sequence: log in, port, log out, then summarise.
MoinMoinLogin();
PortWiki(@ARGV);
MoinMoinLogout();

foreach my $summary (
       [ "Unchanged",           \@UnchangedPages ],
       [ "Updated",             \@UpdatedPages ],
       [ "Created",             \@CreatedPages ],
       [ "Have Broken Markup",  \@BrokenMarkupPages ],
       [ "Couldn't be Created", \@UnCreatedPages ],
)
{
       my ($action, $pages) = @$summary;
       ReportPageCounts($action, @$pages);
}

sub PortWiki
{
       #
       # Port the pages named in the argument list, or every page found by
       # GetUseModIndex() when called with no arguments.
       #
       # Prints a header line and then finishes the status line of each page
       # with a " (n/total)" progress marker.
       #

       my @pagesToPort = @_;
       my $scope;

       if (@pagesToPort)
       {
               $scope = "@pagesToPort";
       }
       else
       {
               @pagesToPort = GetUseModIndex();
               $scope = "all pages bar @ImmutablePages";
       }

       my $total = scalar @pagesToPort;
       print "$total Pages to be ported ($scope)\n";

       # Debug aid: pages that sort before this name are skipped.
       # The empty string skips nothing.
       my $resumeFrom = "";

       foreach my $index (0 .. $#pagesToPort)
       {
               my $wikiName = $pagesToPort[$index];
               next if $wikiName lt $resumeFrom;

               my $converted = PortPage($wikiName);
               PostIfDiff($wikiName, $converted);

               # PostIfDiff leaves its status line open; close it here.
               my $position = $index + 1;
               print " ($position/$total)\n";
       }
}

sub GetUseModIndex
{
       #
       # Fetch the UseMod page index ($UseModBaseUrl . 'action=index') and
       # return the list of page names that should be ported.
       #
       # action=index returns a list of all pages, but it seems raw=1
       # can't be used on the index.
       #
       # So scan the HTML for the page links (they're one per line), pull
       # the page name out of each link, and drop the @ImmutablePages.
       #
       # Returns a list of page names (possibly empty).
       #

       my $url = $UseModBaseUrl . 'action=index';

       $Mech->get( $url );

       # Immutable pages are compared by exact name. Matching them as
       # patterns against the whole HTML line would also discard every
       # page whose name merely contains e.g. "WikiName".
       my %isImmutable = map { $_ => 1 } @ImmutablePages;

       my @indexList = ();

       foreach my $line (split(/\n/, $Mech->content()))
       {
               next unless $line =~ /wikipagelink/;            # link lines
               next if $line =~ /wiki\.pl\?action=/;           # -action links

               # Use the capture only when the match succeeded; after a
               # failed match $1 still holds the value from an earlier one.
               # [^"]* stops at the closing quote of the href, even if the
               # tag carries further quoted attributes.
               next unless $line =~ /"wiki\.pl\?([^"]*)"/;
               my $wikiName = $1;

               next if $isImmutable{$wikiName};                # -ImmutablePages

               push (@indexList, $wikiName);
       }

       return @indexList;
}

sub PortPage
{
       #
       # Fetch the raw text of page $wikiName from the UseMod wiki and
       # convert it, line by line, to MoinMoin markup.
       #
       # Returns the converted page as a scalar. When the conversion recorded
       # markup problems, the report from CreateBrokenMarkupReport() is
       # appended to the page and $wikiName is added to @BrokenMarkupPages.
       #
       # Dies if the fetched text contains no lines.
       #

       use Encode qw( decode FB_CROAK );
       use HTML::Entities qw( decode_entities );

       my ($wikiName) = @_;

       # Every page starts with empty error report hashes.
       foreach my $record (\%ImageInTitle, \%LinkInTitle, \%WikiNameInTitle,
                           \%EntityInTitle, \%EntityInLink)
       {
               %$record = ();
       }

       my $sourceUrl = join('', $UseModBaseUrl, 'action=browse&raw=1&id=', $wikiName);
       $Mech->get($sourceUrl);

       # The fetched text is treated as iso-8859-1, then HTML entities are
       # expanded in place.
       my $pageText = $Mech->content();
       $pageText = decode('iso-8859-1', $pageText, FB_CROAK);
       decode_entities($pageText);

       my @lines = split(/\n/, $pageText);
       @lines or die "Couldn't get $sourceUrl";

       # The (zero-based) index doubles as the line number in the report.
       foreach my $index (0 .. $#lines)
       {
               $lines[$index] = UseModtoMoinMoinLine($lines[$index], $index);
       }

       my $report = CreateBrokenMarkupReport();
       push (@BrokenMarkupPages, $wikiName) if $report ne "";

       # Add a final \n to match MoinMoin raw.
       return join("\n", @lines) . $report . "\n";
}

sub UseModtoMoinMoinLine
{
       #
       # Convert one line of UseMod markup to MoinMoin markup.
       #
       # Arguments:
       #   $line    - one line of the UseMod page
       #   $lineNum - the number the caller uses for this line; it is the key
       #              under which problems are recorded
       #
       # Returns the converted text. One input line can turn into several
       # output lines (<br> and definition lists insert "\n").
       #
       # Side effects: may add entries to the global report hashes
       # %ImageInTitle, %LinkInTitle, %WikiNameInTitle, %EntityInTitle and
       # %EntityInLink.
       #
       # The substitutions are order dependent (e.g. blank lines are emptied
       # before the "preformatted" rule, three-level list rules run before
       # the shorter ones), so do not reorder them.
       #

       my $line = shift;
       my $lineNum = shift;

       # Detect titles - special handling needed, because UseMod allows
       # images and links in titles, while MoinMoin doesn't.

       my $isTitle = 0;
       if ($line =~ /^\=.*\=\r$/)
       {
               #= Title =
               $isTitle = 1;
       }
       elsif ($line =~ /^ .*:/)
       {
               # Subtitle:
               $isTitle = 1;
       }

       #
       # Basic Formatting
       #

       # MoinMoin doesn't support line breaks.
       # /g so that every <br> on the line is converted, not just the first.
       $line =~ s/\s*<br>\s*/\n\n/g;

       # MoinMoin is more strict than UseMod about title formatting
       $line =~ s/^\=\=\=([^=]+)\=+(\s)+$/\=\=\=$1\=\=\=\r/;
       $line =~ s/^\=\=([^=]+)\=+(\s)+$/\=\=$1\=\=\r/;
       $line =~ s/^\=([^=]+)\=+(\s)+$/\=$1\=\r/;

       $line =~ s/<\/?b>/\'\'\'/g;             # bold
       $line =~ s/<\/?strong>/\'\'\'/g;        # strong=bold

       $line =~ s/<\/?i>/\'\'/g;               # italic
       $line =~ s/<\/?em>/\'\'/g;              # em=italic
       $line =~ s/<\/?u>/_/g;                  # underline
       $line =~ s/<\/?sup>/^/g;                # superscript
       # Both <sub> and </sub> must match; the closing tag needs the "\/".
       $line =~ s/<\/?sub>/,,/g;               # subscript
       $line =~ s/<tt>/\{\{\{ /g;              # inline_preformatted_start
       $line =~ s/<\/tt>/ \}\}\}/g;            # inline_preformatted_end

       # Need to replace <nowiki> tags with ! infront of each word
       # The following only affects single word cases
       $line =~ s/<nowiki>(\w+)\s*<\/nowiki>/!$1/g;    # strip_wiki_formatting

       $line =~ s/\\$//g;                      # end_of_line_continuation_removal

       $line =~ s/^\s*$//g;    # blank line (do this before "preformatted")

       $line =~ s/^ (.*)$/\{\{\{ $1 \}\}\}/g;  # preformatted
       $line =~ s/<pre>/\{\{\{ /g;             # preformatted2_start
       $line =~ s/<\/pre>/ \}\}\}/g;           # preformatted2_end

       # lists: the nesting level is expressed by the number of leading
       # spaces (one per level), for bullets and numbers alike.
       $line =~ s/^\*\*\*/   \* /;             # bullet_list_three_level
       $line =~ s/^\*\*/  \* /;                # bullet_list_two_level
       $line =~ s/^\*/ \* /;                   # bullet_list_one_level

       $line =~ s/^### /   1. /;               # number_list_three_levels
       $line =~ s/^## /  1. /;                 # number_list_two_levels
       $line =~ s/^# / 1. /;                   # number_list_one_level

       # definition lists.
       #
       # UseMod:
       # ;;;SomeTerm: SomeDefinition
       # MoinMoin:
       #    SomeTerm: SomeDefintion
       #
       # However, MoinMoin definition list's aren't much use to us, since
       # they don't allow links in the definition title.
       #
       # So we replace definition lists with ul bulleted lists,
       # and indented paragraphs.
       #
       # This replacement also looks for the following pattern at the start
       # of the line: [.*][.*]
       #
       # This is discarded, it was previously used to add anchor points.
       #
       # (Proper MoinMoin definition lists would instead be produced by:
       #   s/^;;;(.+):(.+)$/   $1:: $2/;   s/^;;(.+):(.+)$/  $1:: $2/;
       #   s/^;(.+):(.+)$/ $1:: $2/; )
       #

       $line =~ s/^;;;\s*(\[.*?\]\[.*?\])?([^:]*):(.+)$/   * $2:\n     $3/;    # definition_three_levels
       $line =~ s/^;;\s*(\[.*?\]\[.*?\])?([^:]*):(.+)$/  * $2:\n    $3/;       # definition_two_levels
       $line =~ s/^;\s*(\[.*?\]\[.*?\])?([^:]*):(.+)$/ * $2:\n   $3/;  # definition_one_level

       $line =~ s/^:::([^:].*)$/   $1/;                # indenting_three_levels
       $line =~ s/^::([^:].*)$/  $1/;          # indenting_two_levels
       $line =~ s/^:([^:].*)$/ $1/;            # indenting_one_level

       # UseMod [#BladiBlah], MoinMoin [[Anchor(BladiBlah)]]

       $line =~ s/\[\#([a-zA-Z0-9 _]+)\]/\[\[Anchor\($1\)\]\]/g; # anchors

       # UseMod [[One]], MoinMoin ["One"].  Force a link to single word wikipage
       $line =~ s/\[\[([A-Z]+[a-z0-9]+)\]\]/\[\"$1\"\]/g; # odd links

       # UseMod [/BladiBlah johoho], MoinMoin [wiki:/BlaDiBlah fancy link]
       $line =~ s/(?:^| )\[(\/[a-zA-Z0-9]+) ([^\]]+)\]/[:$1: $2]/g; # fancy_links_0

       # UseMod [BladiBlah johoho], MoinMoin [wiki:/BlaDiBlah fancy link]
       $line =~ s/(?:^| )\[([A-Z]+[a-z0-9]+[A-Z]+[a-zA-Z0-9]+) ([^\]]+)\]/[:$1: $2]/g; # fancy_links_0

       # UseMod [Bla di _da johoho], MoinMoin ["Bla di _da johoho"]
       # (but not [wiki: ], and watch for [[ by a ' ' prefix
       $line =~ s/(?:^| )\[([^w\]\[][a-zA-Z0-9 _]+)\]/["$1"]/g; # fancy_links_0_2

       # UseMod [[BlaDiBlah | fancy link]], MoinMoin [:BlaDiBlah: fancy link]
       $line =~ s/\[\[(\/?[a-zA-Z0-9]+) *\| *([^\]]+)\]\]/[:$1:$2]/g; # fancy_links_1

       # Usemod [[bladlaslsla]], MoinMoin [" "]
       #$line =~ s/\[\[([a-zA-Z0-9 _]+)\]\]/["$1"]/g; # fancy_links_2 # Not needed? JohnC

       # Usemod [[blah]], MoinMoin ["blah"]
       $line =~ s/\[\[([a-zA-Z0-9 _]+)\]\]/\[\"$1\"\]/g;

       # this was too strict...
       #$line =~ s/\[\[([A-Z][a-z]+[A-Z][a-zA-Z]+) *\| ([^\]]+)\]\]/[wiki:$1 $2]/g; # fancy_links_1

       # UseMod allows DDASDSaDASLeas as wiki name, Moin is more strict
       # (watch for fancy_links_2 by looking for a ' ' prefix)
       $line =~ s/ ([A-Z][A-Z]+[a-z0-9]+[A-Z]+[A-Za-z0-9]*)/ ["$1"]/g; # fancy_links_4
       $line =~ s/ ([A-Z]+[a-z0-9]+[A-Z][A-Z]+[A-Za-z0-9]*)/ ["$1"]/g; # fancy_links_5

       # UseMod forces links using ""link"", Moin uses ''''''link''''''
       $line =~ s/""/''''''/g; # fancy_links_5

       #
       # Replace html entities with literals
       #

       $line =~ s/&nbsp;/ /g; # " "
       $line =~ s/&ndash;/-/g; # "-"
       $line =~ s/&bull;/•/g; #bullet

       $line =~ s/&#[xX]([A-Fa-f0-9]+);/"\&\#". hex($1) . ";"/eg; # convert any hex entities to decimal

       $line =~ s/\&\#([0-9]+)\;/chr($1)/eg; # convert numerical entities to literals

       #
       # Links
       #

       if ($isTitle)
       {
               #
               # Report images & links in titles
               #

               if ($line =~ /http\:.+\.(gif|png|jpg|jpeg) /)
               {
                       # Move the image URL onto its own line ahead of the title text.
                       $line =~ s/(.*?)(http\:.+\.)(gif|png|jpg|jpeg)(.*?)/$2$3\n$1 $4/g;

                       $ImageInTitle{$lineNum} = $line;
               }

               if ($line =~ /\[.*\]/)
               {
                       $LinkInTitle{$lineNum} = $line;
               }

               if ($line =~ /[A-Z]+[a-z0-9]+[A-Z]+[a-z0-9]/)
               {
                       $WikiNameInTitle{$lineNum} = $line;
               }

               if ($line =~ /\&\#([0-9]+)\;/)
               {
                       $EntityInTitle{$lineNum} = $line;
               }
       }

       # Numeric entities inside [...] are reported for every line, title or not.
       if ($line =~ /\[.*\&\#([0-9]+)\;.*\]/)
       {
               $EntityInLink{$lineNum} = $line;
       }

       return $line;
}

sub PostIfDiff
{
       #
       # Upload $portedPage to the MoinMoin page $wikiName, unless the raw
       # text MoinMoin currently serves for that page is already identical.
       #
       # Prints a one-line status without a trailing newline (the caller
       # finishes the line) and records $wikiName in one of @UnchangedPages,
       # @UpdatedPages, @CreatedPages or @UnCreatedPages.
       #
       # Dies if a differing $portedPage is empty, or if the edit page has
       # no second form.
       #

       use Encode qw( encode );

       my ($wikiName, $portedPage) = @_;

       my $pageUrl = $MoinMoinBaseUrl . $wikiName;
       my $rawUrl = $pageUrl . '?action=raw';

       $Mech->get( $rawUrl );
       my $currentText = $Mech->content();

       # Each outcome is a status message plus the list the page goes into.
       my ($status, $bucket);

       if ($currentText eq $portedPage)
       {
               ($status, $bucket) = ("No change to $wikiName", \@UnchangedPages);
       }
       else
       {
               die "$wikiName is empty" if $portedPage eq "";

               $Mech->get( $pageUrl . '?action=edit' );
               defined($Mech->form_number(2))
                       or die "couldn't find form";

               my $payload = encode("utf-8", $portedPage);
               $Mech->field('savetext', $payload, 1);
               $Mech->submit();

               if ($currentText ne "")
               {
                       ($status, $bucket) = ("Updated $wikiName", \@UpdatedPages);
               }
               else
               {
                       # Check the page was created sucessfully
                       # (there is an issue with the creation of some pages)
                       $Mech->get( $rawUrl );

                       if ($Mech->content() eq "")
                       {
                               # The page wasn't created - report this.
                               ($status, $bucket) = ("ERROR: Couldn't Create $wikiName", \@UnCreatedPages);
                       }
                       else
                       {
                               ($status, $bucket) = ("Created $wikiName", \@CreatedPages);
                       }
               }
       }

       # Don't end line of print statements
       print $status;
       push (@$bucket, $wikiName);
}

sub MoinMoinLogin
{
       #
       # Log in to MoinMoin as the bot user through the UserPreferences page.
       # Uses form number 3 of that page; dies if form_number() returns undef.
       #

       $Mech->get( $MoinMoinBaseUrl . 'UserPreferences' );

       defined($Mech->form_number(3))
               or die "couldn't find form";

       my %credentials = (
               username => $BotUserName,
               password => $BotPassword,
       );

       # Fill in the name first, then the password, then press "login".
       foreach my $fieldName ('username', 'password')
       {
               $Mech->field($fieldName, $credentials{$fieldName}, 1);
       }

       $Mech->click('login');
}

sub MoinMoinLogout
{
       #
       # Log the bot user out of MoinMoin through the UserPreferences page.
       # Uses form number 3 of that page; dies if form_number() returns undef.
       #

       $Mech->get( $MoinMoinBaseUrl . 'UserPreferences' );

       defined($Mech->form_number(3))
               or die "couldn't find form";

       $Mech->click('logout');
}

sub CreateBrokenMarkupReport
{
       #
       # Build the report that is appended to the bottom of each MoinMoin
       # WikiPage that needs manual work from an editor.
       #
       # Reads the global report hashes (%ImageInTitle, %LinkInTitle,
       # %WikiNameInTitle, %EntityInTitle, %EntityInLink), each of which maps
       # a line number to the text of the offending line.
       #
       # Returns the report as a scalar, or "" when all five hashes are empty.
       # Within each section the entries are listed in ascending line number.
       #

       # One entry per report section, in output order.
       my @sections = (
               [ "ImageInTitle``s Moved on these lines:\n", \%ImageInTitle ],
               [ "LinkInTitle``s on these lines:\n",        \%LinkInTitle ],
               [ "WikiNameInTitle``s on these lines:\n",    \%WikiNameInTitle ],
               [ "EntityInTitle``s on these lines:\n",      \%EntityInTitle ],
               [ "EntityInLink``s on these lines:\n",       \%EntityInLink ],
       );

       my $body = "";

       foreach my $section (@sections)
       {
               my ($heading, $lineFor) = @$section;
               next unless %$lineFor;          # nothing recorded for this section

               $body .= $heading;

               # The keys are line numbers, so sort them numerically; the
               # default string sort would list line 10 before line 2.
               foreach my $lineNum (sort { $a <=> $b } keys %$lineFor)
               {
                       my $lineText = $lineFor->{$lineNum};
                       $lineText =~ s/\s$//g;  # strip the trailing whitespace character
                       $body .= " * $lineNum \{\{\{ $lineText \}\}\}\n";
               }
               $body .= "\n";
       }

       # No section produced any output: the page needs no report.
       return "" if $body eq "";

       my $report = "";
       $report .= "## Delete this section once the page has been fixed\n";
       $report .= "----\n";
       $report .= "\/!\\ The Markup on This Page Needs Fixing\n\n";
       $report .= "This wiki page has been ported by the WikiMigrationBot, and this link to the WikiMigrationBotReport flags that this page contains wiki markup that needs fixing.\n\n";
       $report .= $body;
       $report .= "----\n";
       $report .= "## End of section to be deleted\n";

       return $report;
}

sub ReportPageCounts
{
       #
       # Print a summary block for one outcome: a rule, the number of pages
       # with the given $action, the page names on one line, and a closing rule.
       #
       # Arguments: the action label followed by the list of page names.
       #

       my ($action, @nameList) = @_;

       my $rule = "==============================\n";
       my $pageCount = scalar @nameList;

       foreach my $outputLine ($rule,
                               "$pageCount pages $action:\n",
                               "@nameList\n",
                               $rule)
       {
               print $outputLine;
       }
}

